
# Overview ---------------------------------------------------------------------
#   Cleaning up the baseline quantity files.


# Setup ------------------------------------------------------------------------
  # Options
  options(stringsAsFactors = FALSE)
  # Libraries
  library(data.table)
  library(magrittr)
  library(lubridate)
  library(XLConnect)
  # Directories
  dir_project <- "/Users/edwardarubin/Dropbox/Research/MyProjects/NaturalGas/"
  dir_data_raw <- paste0(dir_project, "DataRaw/")
  dir_data_csv <- paste0(dir_project, "DataCsv/")


# Load and clean PGE's combined baseline file ----------------------------------
  # Open the combined baseline file (from PGE)
  base_dt <- readWorksheetFromFile(
    file = paste0(dir_data_raw, "pgeBaselinesCombined.xlsx"),
    sheet = 1) %>% data.table()
  # Clean up baseline columns: remove "*" and "**"; convert to numeric
  base_dt[, `:=`(
    base_ind = base_ind %>%
      gsub(pattern = "[^0-9\\.]", replacement = "") %>%
      as.numeric(),
    base_master = base_master %>%
      gsub(pattern = "[^0-9\\.]", replacement = "") %>%
      as.numeric()
    )]
  # Clean up date columns: convert to "Date" format
  base_dt[, `:=`(
    season_start_month = season_start %>% ymd() %>% month(),
    season_start_day   = season_start %>% ymd() %>% day(),
    season_stop_month  = season_stop %>% ymd() %>% month(),
    season_stop_day    = season_stop %>% ymd() %>% day(),
    eff_start          = eff_start %>% ymd() %>% as.Date(),
    eff_stop           = eff_stop %>% ymd() %>% as.Date()
    )][, c("season_start", "season_stop") := NULL]
  # Convert base_dt from "effective period" level to month level
  #   (This conversion helps with joining baseline data to billing data)
  start_dates <- base_dt$eff_start %>% unique()
  month_dt <- lapply(X = seq_along(start_dates),
    FUN = function(i) {
      # Grab start date associated with index i
      start_date <- start_dates[i]
      # Subset of the data that have the same "effective period"
      sub_dt <- base_dt[eff_start == start_date]
      # Grab the seasons' start/stop months
      winter <- sub_dt[season == "winter",
        .(season_start_month, season_stop_month)] %>% unique() %>% unlist()
      summer <- sub_dt[season == "summer",
        .(season_start_month, season_stop_month)] %>% unique() %>% unlist()
      # Add 12 to winter's stop
      winter[2] <- winter[2] + 12
      # Expand the months from beginning to end of season
      season_dt <- rbindlist(list(
        data.table(month = summer[1]:summer[2], season = "summer"),
        data.table(month = winter[1]:winter[2], season = "winter")
        ))
      # Change months > 12 to < 12
      season_dt[month > 12, month := month - 12]
      # The end of the effective period
      stop_date <- sub_dt$eff_stop[1]
      # Create data.table spanning the first to last month of the eff. period
      monthly_dt <- data.table(month_date =
        seq.Date(start_date, stop_date, by = "month"))
      # Add a column for the month
      monthly_dt[, month := month_date %>% month()]
      # Merge the seasons onto the months
      monthly_dt %<>% merge(y = season_dt, by = "month", sort = FALSE)
      # Merge the monthly data with the baseline subset
      monthly_dt %<>% merge(y = sub_dt, by = "season", sort = FALSE,
        allow.cartesian = TRUE)
      # Return monthly_dt
      return(monthly_dt)
      }) %>% rbindlist()
  # Sort the data.table by month
  setorder(month_dt, month_date, territory)
  # Drop unwanted columns
  month_dt[, c("season_start_month", "season_start_day",
    "season_stop_month", "season_stop_day") := NULL]
  # Save as .csv
  write.csv(month_dt, paste0(dir_data_csv, "pgeBaselinesMonthly.csv"),
    row.names = FALSE)
